import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
tammi = pd.read_excel("vuosi-2019-autot.xls", header=None, sheet_name=1, skiprows=[0,1])
helmi = pd.read_excel("vuosi-2019-autot.xls", header=None, sheet_name=2, skiprows=[0,1])
maalis = pd.read_excel("vuosi-2019-autot.xls", header=None, sheet_name=3, skiprows=[0,1])
huhti = pd.read_excel("vuosi-2019-autot.xls", header=None, sheet_name=4, skiprows=[0,1])
#2015-2018 henkilöautovero
df = pd.DataFrame()
for i in range(1,13):
data = pd.read_excel("vuosi-2018_autot.xls", header=None, sheet_name=i, skiprows=[0,1])
data2 = pd.read_excel("vuosi-2017-autot.xls", header=None, sheet_name=i, skiprows=[0,1])
data3 = pd.read_excel("vuosi-2016-autot.xls", header=None, sheet_name=i, skiprows=[0,1])
data4 = pd.read_excel("vuosi-2015_autot.xls", header=None, sheet_name=i, skiprows=[0,1])
df = df.append(data)
df = df.append(data2)
df = df.append(data3)
df = df.append(data4)
df.columns = ['Merkki', 'Malli', 'Mallin tarkennin', 'Kunto A=Alennettu',
'Päätöspäivä', 'Käyttöönottopvä', 'Ajokm/1000', 'Verotusarvo',
'Autovero']
df.head()
tammi.head()
helmi.head()
maalis.head()
huhti.head()
v2019 = pd.concat([tammi, helmi, maalis, huhti])
v2019.columns = ['Merkki', 'Malli', 'Mallin tarkennin', 'Kunto A=Alennettu',
'Päätöspäivä', 'Käyttöönottopvä', 'Ajokm/1000', 'Verotusarvo',
'Autovero']
v2019.head()
verot = pd.concat([v2019, df])
verot.head()
verot.isna().sum()
verot.drop("Kunto A=Alennettu",1,inplace=True)
verot.reset_index(drop=True, inplace=True)
verot.Päätöspäivä = pd.to_datetime(verot.Päätöspäivä, format='%Y%m%d')
verot.Käyttöönottopvä = pd.to_datetime(verot.Käyttöönottopvä, format='%Y%m%d')
verot["ika"] = (verot.Päätöspäivä - verot.Käyttöönottopvä).dt.days
verot.head()
verot.Merkki.value_counts().plot(kind="bar", figsize=(35,7))
new_df = verot[verot.Merkki == "MERCEDES-BENZ"]
new_df.head()
new_df.Malli.describe()
new_df.Malli.value_counts().head()
tuoduin = new_df[new_df["Malli"] == "C 220"]
tuoduin.reset_index(drop=True, inplace=True)
tuoduin.head()
tuoduin.isna().sum()
tuoduin.dtypes
tuoduin["Ajokm/1000"] = tuoduin["Ajokm/1000"].astype(int)
tuoduin.dtypes
tuoduin["Mallin tarkennin"].value_counts().head()
tuoduin_malli_tarkennin = new_df[new_df["Mallin tarkennin"] == "2.1 D 5D STW AUT 125KW"]
dummies_for_machine_learning = pd.get_dummies(tuoduin["Mallin tarkennin"], drop_first=True)
dflin = pd.concat([tuoduin["Ajokm/1000"], tuoduin["Verotusarvo"], tuoduin["Autovero"], tuoduin["ika"], dummies_for_machine_learning],axis=1)
dflin.reset_index(drop=True, inplace=True)
dflin.head()
dflin.isna().sum().sum()
b = intercept
y = m1x1 + m2x2 + m3x3... + b
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
X = dflin.drop(["Autovero", "Verotusarvo"],1)
y = dflin["Autovero"]
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
regressor.coef_
regressor.intercept_
y_pred = regressor.predict(x_test)
df = pd.DataFrame({"Actual": y_test, "Prediction": y_pred})
df.head(10)
df1 = df.copy()
df1.reset_index(drop=True, inplace=True)
df1.plot(figsize=(25,10))
df2 = df.head(35)
df2.plot(kind="bar",figsize=(20,10))
plt.grid(which="major", linestyle="-", linewidth="0.5", color="green")
plt.grid(which="minor", linestyle=":", linewidth="0.5", color="black")
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "k--", lw=4)
ax.set_xlabel("Actual")
ax.set_ylabel("Prediction")
plt.show()
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
percent_diff = np.abs(y_pred - y_test) / y_test
result = (percent_diff < 0.2).sum() / len(y_test)
print('Percent within 20%% of target value: %.2f' % result)
def mreg(df, automalli):
malli = df[df["Malli"] == automalli]
mallitarkennin = pd.get_dummies(malli["Mallin tarkennin"], drop_first=True)
concated = pd.concat([malli, mallitarkennin], axis=1)
X2 = concated.drop(["Merkki", "Malli", "Mallin tarkennin", "Verotusarvo", "Autovero"],1)
y2 = concated["Autovero"]
X_train2, x_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)
regressor2 = LinearRegression()
regressor2.fit(X_train2, y_train2)
y_pred2 = regressor2.predict(x_test2)
print(malli.Merkki.unique() ,automalli, "Autoveroennuste:")
r2 = r2_score(y_test2, y_pred2)
print("accuracy", r2)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test2, y_pred2))
print('Mean Squared Error:', metrics.mean_squared_error(y_test2, y_pred2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test2, y_pred2)))
df = pd.DataFrame({"Actual": y_test2, "Prediction": y_pred2})
df2 = df.head(35)
df2.plot(kind="bar",figsize=(20,10), title = automalli)
plt.grid(which="major", linestyle="-", linewidth="0.5", color="green")
plt.grid(which="minor", linestyle=":", linewidth="0.5", color="black")
fig, ax = plt.subplots()
ax.scatter(y_test2, y_pred2)
ax.plot([y_test2.min(), y_test2.max()], [y_test2.min(), y_test2.max()], "k--", lw=4)
ax.set_xlabel("Actual")
ax.set_title("Testi-vs-Ennnusteet")
ax.set_ylabel("Prediction")
plt.show()
print("--------------------------------------------------------------------------------------------------------------")
top10_autot = verot.Malli.value_counts().head(10).index.tolist()
verot.drop(["Päätöspäivä", "Käyttöönottopvä"],1,inplace=True)
verot.dropna(inplace=True)
for i in top10_autot:
mreg(verot, str(i))
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
#%matplotlib notebook
X = tuoduin_malli_tarkennin[['ika', 'Ajokm/1000']]
y = tuoduin_malli_tarkennin['Autovero']
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
xx1, xx2 = np.meshgrid(np.linspace(X.ika.min(), X.ika.max(), 100),
np.linspace(X['Ajokm/1000'].min(), X['Ajokm/1000'].max(), 100))
Z = est.params[0] + est.params[1] * xx1 + est.params[2] * xx2
fig = plt.figure()
ax = Axes3D(fig, azim=-115, elev=15)
surf = ax.plot_surface(xx1, xx2, Z, cmap=plt.cm.RdBu_r, alpha=0.9, linewidth=0)
resid = y - est.predict(X)
ax.scatter(X[resid >= 0].ika, X[resid >= 0]['Ajokm/1000'], y[resid >= 0], color='black', alpha=1.0, facecolor='white')
ax.scatter(X[resid < 0].ika, X[resid < 0]['Ajokm/1000'], y[resid < 0], color='black', alpha=1.0)
ax.set_xlabel("Auton käyttöikä")
ax.set_ylabel("Ajokm/1000")
ax.set_zlabel("Autovero")
ax.set_title("MERCEDES-BENZ 2.1 D 5D STW AUT 125KW")
plt.show(fig)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
X = tuoduin_malli_tarkennin[['ika', 'Ajokm/1000']]
y = tuoduin_malli_tarkennin['Autovero']
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
xx1, xx2 = np.meshgrid(np.linspace(X.ika.min(), X.ika.max(), 100),
np.linspace(X['Ajokm/1000'].min(), X['Ajokm/1000'].max(), 100))
Z = est.params[0] + est.params[1] * xx1 + est.params[2] * xx2
surface = go.Surface(
x = xx1,
y = xx2,
z = Z,
showscale = False,
colorscale = "Greys",
opacity = 0.9
)
resid = y - est.predict(X)
trace = go.Scatter3d(
x = X[resid >= 0].ika,
y = X[resid >= 0]['Ajokm/1000'],
z = y[resid >= 0],
mode = "markers",
marker = dict(size = 2, color = "white", opacity = 1, line = dict(color = "black", width = 0.2))
)
trace2 = go.Scatter3d(
x = X[resid < 0].ika,
y = X[resid < 0]['Ajokm/1000'],
z = y[resid < 0],
mode = "markers",
marker = dict(size = 2, color = "black", opacity = 1)
)
layout = go.Layout(
title="MERCEDES-BENZ 2.1 D 5D STW AUT 125KW",
scene = dict(
xaxis = dict(title = "Auton käyttöikä",
backgroundcolor="rgb(200, 200, 230)",
gridcolor="rgb(255, 255, 255)",
showbackground=True,
zerolinecolor="rgb(255, 255, 255)"
),
yaxis = dict(title = "Ajokm/1000",
backgroundcolor="rgb(200, 200, 230)",
gridcolor="rgb(255, 255, 255)",
showbackground=True,
zerolinecolor="rgb(255, 255, 255)"
),
zaxis = dict(title = "Autovero",
backgroundcolor="rgb(200, 200, 230)",
gridcolor="rgb(255, 255, 255)",
showbackground=True,
zerolinecolor="rgb(255, 255, 255)",
),
),
width = 900,
height = 900,
autosize = False,
showlegend = False
)
data = [surface, trace, trace2]
figure = dict(data = data, layout = layout)
offline.iplot(figure)